# Snapshot of the objects that exist before this script runs; the cleanup
# step at the end of the script removes everything that is not listed here.
keep <- ls()

#########################
#Matching CWDB to Census#
#########################

####################################
#matching enlistment to 1860 census#
####################################

# Soldier (CWDB) records prepared for matching; blank strings are read as NA.
# Birth year is coerced to numeric straight after loading.
cwdb_census_match <- fread("./cleaned/cwdb_to_match.csv", na.strings = c("NA", ""))
cwdb_census_match[, birth_year := as.numeric(birth_year)]

# 1860 census records prepared for matching, treated the same way.
census_1860_match <- fread("./cleaned/census_1860_to_match.csv", na.strings = c("NA", ""))
census_1860_match[, birth_year := as.numeric(birth_year)]


# Probabilistic CWDB -> 1860 census linkage. Only runs when the caller has set
# `run_fastlink` to TRUE; writes ./cleaned/cwdb_to_census_links.csv.
if (run_fastlink) {
  #fastlink: block by county, so soldiers are only compared with census
  #records that share their census_county
  block_out <- blockData(cwdb_census_match, census_1860_match,
                         varnames = "census_county")

  #One slot per county block. A slot stays NULL when fastLink fails for it.
  matches_list <- vector(mode = "list", length = length(block_out))

  #looping over census counties,
  #match on cleaned first, last, middle name, first sound, last sound, birth year
  for (i in seq_along(block_out)) {
    print(i)
    x <- block_out[[i]]
    out <- try(
      fastLink(dfA = cwdb_census_match[x$dfA.inds, ],
               dfB = census_1860_match[x$dfB.inds, ],
               varnames = c("match_first_clean", "match_last",
                            "match_middle_init", "birth_year",
                            "first_sound", "last_sound"),
               stringdist.match = c("match_first_clean", "match_last"),
               numeric.match = c("birth_year"),
               partial.match = c("match_first_clean", "match_last",
                                 "birth_year"),
               cut.a.num = 1.5,
               cut.p.num = 3.5,
               cut.a = 0.94,
               cut.p = 0.85,
               n.cores = 9,
               cond.indep = TRUE,
               return.all = FALSE,
               dedupe.matches = FALSE,
               threshold.match = 0.1
      )
    )
    if (inherits(out, "try-error")) {
      #Leave the preallocated NULL untouched. Assigning NULL through `[[<-`
      #would DELETE the element, shortening the list and misaligning the
      #remaining results with their county blocks.
      print("DID NOT RUN")
    } else {
      matches_list[[i]] <- out
    }
  }

  #Blocks for which fastLink produced a result
  ran <- which(!vapply(matches_list, is.null, logical(1)))
  if (length(ran) == 0) {
    stop("fastLink did not run successfully for any county block",
         call. = FALSE)
  }

  #Get matches from each county that ran, pairing every block with its OWN
  #fastLink output (failed blocks are skipped rather than passed on as NULL)
  m_out <- lapply(ran, function(b) {
    blk <- block_out[[b]]
    getMatches(cwdb_census_match[blk$dfA.inds, ],
               census_1860_match[blk$dfB.inds, ],
               matches_list[[b]], threshold.match = 0.1) %>% as.data.table
  })
  m_out <- rbindlist(m_out)

  #Keep matches if first name is full/partial/sound match
  #AND last name is full/partial/sound match
  #identify best probability match for person
  #NOTE(review): this relies on gamma.k following the order of `varnames`
  #above (1 = first name, 2 = last name, 5 = first sound, 6 = last sound);
  #confirm against the fastLink output if `varnames` is ever reordered.
  m_out <- m_out[(gamma.1 %in% 1:2 | gamma.5 %in% 2) &
                   (gamma.2 %in% 1:2 | gamma.6 %in% 2), ] %>%
    .[, best := posterior == max(posterior), by = personpk]

  #Keep best match if probability of match > 0.8
  #generate weight of match: ties for best share weight in proportion to
  #their posterior, so weights sum to 1 within each personpk
  cwdb_census_matches <- m_out[(best) & posterior > 0.8,
                               list(personpk, mpcid, posterior,
                                    match_first, match_first_clean,
                                    match_middle, match_middle_init,
                                    match_last,
                                    first_sound, last_sound,
                                    birth_place,
                                    birth_year,
                                    census_county)] %>% unique %>%
    .[, wt := posterior / sum(posterior), by = personpk]

  #add in census name and birth year
  #NOTE(review): this table is not referenced again in this script before the
  #final cleanup; kept in case it is inspected interactively.
  census_cwdb_matches <- census_1860_match[mpcid %in% unique(cwdb_census_matches$mpcid),
                                           list(mpcid,
                                                match_first,
                                                match_first_clean,
                                                match_middle, match_middle_init,
                                                match_last,
                                                first_sound, last_sound,
                                                birth_place,
                                                birth_year,
                                                census_county
                                           )]

  #save cwdb-census links
  cwdb_census_matches[, list(personpk, mpcid, census_county, census_wt = wt)] %>%
    fwrite(., "./cleaned/cwdb_to_census_links.csv")
}

##########################################
#Exact Matching: CWDB to 1874 directories#
##########################################

# Soldiers to link to the 1874 directories: every CWDB row whose `survivedwar`
# is not "N" (rows where `survivedwar` is NA are dropped by this filter too).
cwdb_census_matches_to_1874 <- cwdb_census_match[survivedwar != "N"]

# Numeric birth year plus the first letter of the cleaned first name
cwdb_census_matches_to_1874[, `:=`(
  birth_year = as.numeric(birth_year),
  first_initial = str_sub(match_first_clean, 1, 1)
)]

# 1874 directory records, prepared the same way (blank strings read as NA)
directory_1874_match <- fread("./cleaned/directory_to_match.csv", na.strings = c("NA", ""))
directory_1874_match[, `:=`(
  birth_year = as.numeric(birth_year),
  first_initial = str_sub(match_first_clean, 1, 1)
)]


#hold matches: one slot per surviving soldier; a slot stays NULL when the
#soldier has no usable first name (rbindlist later skips NULL slots)
match_list_exact <- vector(mode = "list",
                           length = nrow(cwdb_census_matches_to_1874))

#Loop over surviving soldiers.
#Comparison columns written on each candidate directory row:
#  gamma.1 first name  (1 = Jaro-Winkler distance < 0.1, 2 = identical)
#  gamma.2 last name   (1 = Jaro-Winkler distance < 0.1, 2 = identical)
#  gamma.3 first-name sound code identical (2)
#  gamma.4 last-name sound code identical (2)
#  gamma.5 first initial agrees and at least one side has only an initial (2)
#NOTE(review): `%in%` treats NA as equal to NA, so a soldier with a missing
#last name / sound code scores as an exact match against directory rows whose
#value is also missing; confirm that is intended.
#seq_len() (rather than 1:nrow) keeps the loop from running when there are
#no surviving soldiers.
for (i in seq_len(nrow(cwdb_census_matches_to_1874))) {
  row <- cwdb_census_matches_to_1874[i, ]

  #Skip soldiers without a first name. Skipping explicitly (instead of
  #falling through) guarantees a stale `out` from a previous soldier is
  #never stored under this soldier's personpk.
  first_len <- str_length(row$match_first_clean)
  if (is.na(first_len) || first_len < 1) {
    next
  }

  #match people to those living in same county in 1874
  sub <- directory_1874_match[census_county %in% row$census_county]
  sub[, paste0("gamma.", 1:5) := list(0, 0, 0, 0, 0)]

  #Last-name evidence is scored the same way whether or not the soldier has
  #a full first name. The exact match is applied after the fuzzy match so
  #that 2 overrides 1.
  sub[last_sound %in% row$last_sound, gamma.4 := 2]
  sub[stringdist(match_last, row$match_last, method = "jw") < 0.1, gamma.2 := 1]
  sub[match_last %in% row$match_last, gamma.2 := 2]

  if (first_len > 1) {
    #first name is given, not just an initial
    sub[first_sound %in% row$first_sound, gamma.3 := 2]
    sub[stringdist(match_first_clean, row$match_first_clean, method = "jw") < 0.1,
        gamma.1 := 1]
    sub[match_first_clean %in% row$match_first_clean, gamma.1 := 2]
    #directory entry consists of an initial only, and it agrees with the
    #soldier's first initial
    sub[(match_first_clean == first_initial) &
          (first_initial == row$first_initial), gamma.5 := 2]
    #keep as potential matches
    #if first name is exact/close match or sound alike
    #or if first initial matches and one entry has only initials
    #and last name is exact/close or sounds alike
    out <- sub[(gamma.1 %in% 1:2 | gamma.3 %in% 2 | gamma.5 %in% 2) &
                 (gamma.2 %in% 1:2 | gamma.4 %in% 2)]
  } else {
    #soldier has only an initial: compare initials
    sub[first_initial == row$first_initial, gamma.5 := 2]
    #Keep match if first initial matches and last name is partial/full/sound match
    out <- sub[(gamma.5 %in% 2) &
                 (gamma.2 %in% 1:2 | gamma.4 %in% 2)]
  }

  #Tag the candidates with the soldier they belong to
  out[, personpk := row$personpk]
  match_list_exact[[i]] <- out
}

# Stack the per-soldier candidate tables (empty slots are skipped) and drop
# exact duplicate rows
cwdb_1874_matches <- unique(rbindlist(match_list_exact))

# Key both tables on the soldier id; keying sorts each table by personpk
setkeyv(cwdb_1874_matches, "personpk")
setkeyv(cwdb_census_matches_to_1874, "personpk")

# Attach the soldier's CWDB columns to every candidate. Columns present in
# both tables keep the CWDB value under the plain name and the candidate's
# value under an `i.` prefix.
cwdb_1874_matches <- cwdb_census_matches_to_1874[cwdb_1874_matches,
                                                 allow.cartesian = TRUE]

# Absolute gap between CWDB birth year and the candidate's birth year
cwdb_1874_matches[, birth_dist := abs(birth_year - i.birth_year)]

# Narrow to the columns needed for filtering
cwdb_1874_matches <- cwdb_1874_matches[, .(
  personpk, image_url, settlement_year,
  gamma.1, gamma.2, gamma.3, gamma.4, gamma.5, use_quality,
  birth_dist,
  party
)]

# Derive the directory record id from the image file name (the part between
# the last "/" and ".png"), then keep one row per distinct combination
cwdb_1874_matches <- unique(cwdb_1874_matches[, .(
  personpk,
  image_url,
  id_1874 = str_extract(image_url, "(?<=[/])[^./]+(?=\\.png$)"),
  party,
  gamma.1, gamma.2, gamma.3, gamma.4, gamma.5, birth_dist
)])

# County = leading run of lower-case letters in the id, in title case
cwdb_1874_matches[, census_county := str_to_title(str_extract(id_1874, "^[a-z]+"))]

# Flag POSSIBLE matches. A candidate qualifies when any of these holds:
#   - first and last name both exact
#   - first name exact, last name close or sounds alike
#   - first name close or sounds alike, last name exact
#   - first initial agrees (initial-only case), last name exact
cwdb_1874_matches[, match_q_1874 := {
  fn_exact   <- gamma.1 %in% 2
  fn_near    <- gamma.1 %in% 1 | gamma.3 %in% 2
  fn_initial <- gamma.5 %in% 2
  ln_exact   <- gamma.2 %in% 2
  ln_near    <- gamma.2 %in% 1 | gamma.4 %in% 2
  (fn_exact & ln_exact) |
    (fn_exact & ln_near) |
    (fn_near & ln_exact) |
    (fn_initial & ln_exact)
}]

# Score = sum of the five comparison columns plus a birth-year bonus:
# +1 when the gap is under 4 years, +1 more when it is under 2 years,
# and no bonus when the gap is unknown
cwdb_1874_matches[, score := gamma.1 + gamma.2 + gamma.3 + gamma.4 + gamma.5 +
                    ifelse(!is.na(birth_dist),
                           (birth_dist < 2) + (birth_dist < 4),
                           0)]

# Best = highest score among a soldier's candidates within one county
cwdb_1874_matches[, best := score == max(score), by = .(personpk, census_county)]

# `use` marks the candidates that are usable potential matches:
#   - gamma.5 is 0 (the match did not rest on an initial), or
#   - gamma.5 is 2 (initial-based match) and the birth years are known and
#     at most 3 years apart
# Everything else, including initial-based matches with an unknown birth-year
# gap, is FALSE.
cwdb_1874_matches[, use := gamma.5 %in% 0 |
                    (gamma.5 %in% 2 & !is.na(birth_dist) & birth_dist <= 3)]

# Write the CWDB -> 1874 directory candidate links to disk
fwrite(x = cwdb_1874_matches, file = "./cleaned/cwdb_to_1874_links.csv")

# Remove every object created by this script: anything currently in the
# workspace that was not recorded in `keep` at the top (`keep` itself stays),
# then trigger garbage collection
rm(list = setdiff(ls(), union(keep, "keep")))
gc()
